{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "76"
      ]
     },
     "execution_count": 1,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text1 = \"Ethics are built right into the ideals and objectives of the United Nations \"\n",
    "\n",
    "len(text1) # The length of text1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "14"
      ]
     },
     "execution_count": 2,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text2 = text1.split(' ') # Return a list of the words in text2, separating by ' '.\n",
    "\n",
    "len(text2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ethics',\n",
       " 'are',\n",
       " 'built',\n",
       " 'right',\n",
       " 'into',\n",
       " 'the',\n",
       " 'ideals',\n",
       " 'and',\n",
       " 'objectives',\n",
       " 'of',\n",
       " 'the',\n",
       " 'United',\n",
       " 'Nations',\n",
       " '']"
      ]
     },
     "execution_count": 3,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "text2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ethics',\n",
       " 'built',\n",
       " 'right',\n",
       " 'into',\n",
       " 'ideals',\n",
       " 'objectives',\n",
       " 'United',\n",
       " 'Nations']"
      ]
     },
     "execution_count": 4,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# List comprehension\n",
    "[w for w in text2 if len(w) > 3] # Words that are greater than 3 letters long in text2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ethics', 'United', 'Nations']"
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[w for w in text2 if w.istitle()] # Capitalized words in text2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['Ethics', 'ideals', 'objectives', 'Nations']"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "[w for w in text2 if w.endswith('s')] # Words in text2 that end in 's'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "6"
      ]
     },
     "execution_count": 7,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# find unique words using set()\n",
    "text3 = 'To be or not to be'\n",
    "text4 = text3.split(' ')\n",
    "\n",
    "len(text4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "5"
      ]
     },
     "execution_count": 8,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set(text4))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'To', 'be', 'not', 'or', 'to'}"
      ]
     },
     "execution_count": 9,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set(text4)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "4"
      ]
     },
     "execution_count": 10,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(set([w.lower() for w in text4])) # .lower converts the string to lowercase."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "{'be', 'not', 'or', 'to'}"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "set([w.lower() for w in text4])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['\"Ethics',\n",
       " 'are',\n",
       " 'built',\n",
       " 'right',\n",
       " 'into',\n",
       " 'the',\n",
       " 'ideals',\n",
       " 'and',\n",
       " 'objectives',\n",
       " 'of',\n",
       " 'the',\n",
       " 'United',\n",
       " 'Nations\"',\n",
       " '#UNSG',\n",
       " '@',\n",
       " 'NY',\n",
       " 'Society',\n",
       " 'for',\n",
       " 'Ethical',\n",
       " 'Culture',\n",
       " 'bit.ly/2guVelr']"
      ]
     },
     "execution_count": 12,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# Processing free-text\n",
    "text5 = '\"Ethics are built right into the ideals and objectives of the United Nations\" \\\n",
    "#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'\n",
    "text6 = text5.split(' ')\n",
    "\n",
    "text6"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['#UNSG']"
      ]
     },
     "execution_count": 13,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# finding hashtags\n",
    "[w for w in text6 if w.startswith('#')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['@']"
      ]
     },
     "execution_count": 14,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "# finding callouts\n",
    "[w for w in text6 if w.startswith('@')]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": [
    "text7 = '@UN @UN_Women \"Ethics are built right into the ideals and objectives of the United Nations\" \\\n",
    "#UNSG @ NY Society for Ethical Culture bit.ly/2guVelr'\n",
    "text8 = text7.split(' ')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['@UN', '@UN_Women']"
      ]
     },
     "execution_count": 16,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "import re # import re - a module that provides support for regular expressions\n",
    "\n",
    "[w for w in text8 if re.search('@[A-Za-z0-9_]+', w)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
